#==============================================================================#
# 2-extract-profiles-m.R
# Matt Curtis 25/08/22
# mjdcurtis@gmail.com
#------------------------------------------------------------------------------#
# extract profile ids from marriages
# updated feb 2 2023
# updated may 29, 2025
#==============================================================================#

rm(list = ls())
gc()
#library(tidyverse)
#library(data.table)
library(stringr)
library(arrow)
library(tidytable)
#library(tidytable)
#------------------------------------------------------------------------------#
# The workspace is already cleared at the top of the script, so it is not
# cleared a second time here.

# Set the working directory once; every relative path below resolves
# against it.
setwd("~/Replication package")

#------------------------------------------------------------------------------#

# Terms that flag a marriage location as France when they occur as a whole
# word in the cleaned location string.
france_good <- c(
  "france", "fr", "francia", "frankreich", "franc", "franca", "francja",
  "frankrijk", "fra", "franciaorszag", "fran", "frankrike", "franche"
)

# Terms that reset the France flag to 0 when they occur as a whole word,
# even if one of the terms above matched.
france_bad <- c("guiana", "guyane", "nouvelle france", "new france", "canada")
#------------------------------------------------------------------------------#
print("Loading details")

# Row count of the raw union-details file. It was computed once with the
# snippet below and is recorded here so the read does not have to be repeated:
# geni <- fread("./1_raw_data/1_2_geni/geni_union_details_patched.csv",
#               showProgress = TRUE,
#               select = c(1))
# print(nrow(geni))
# = 50694248

#------------------------------------------------------------------------------#
# Work through the chunked union-details data. For each chunk: keep marriages
# dated 1500-1900 (or with no year), clean the location text, flag marriages
# located in France, and write the flagged rows to a temp parquet file.

max_chunk <- 10

# Whole-word patterns (\b is the regex word boundary). They are identical for
# every chunk, so they are built once here. One alternation per list flags
# exactly the rows that testing each term in turn would flag.
france_good_rx <- paste0("\\b(", paste(france_good, collapse = "|"), ")\\b")
france_bad_rx <- paste0("\\b(", paste(france_bad, collapse = "|"), ")\\b")

for (chunk_n in 0:max_chunk) {
  print(paste0("Reading chunk ",chunk_n+1,"/",max_chunk+1))

  fname <- paste0("./1_raw_data/1_2_geni/1_2_geni_chunked/geni_union_details/part-",chunk_n,".parquet")
  # Columns are selected by position to limit memory use.
  geni <- read_parquet(fname, col_select = c(1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15)) %>%
    tidytable()
  gc()

  # Keep marriages within 1500-1900, plus those with a missing start year.
  geni <- filter(
    geni,
    (marriage_start_year >= 1500 & marriage_start_year <= 1900) |
      is.na(marriage_start_year)
  )

  # clean location strings
  geni <- geni %>%
    mutate(
      across(
        c(
          ends_with("_place_name"),
          ends_with("_city"),
          ends_with("_county"),
          ends_with("_state"),
          ends_with("_country")
        ),
        ~ . %>%
          iconv(to = 'ASCII//TRANSLIT') %>% # remove accents
          str_to_lower() %>%
          # Replace every run of non-letters with a space. A plain negated
          # class is used: the earlier nested form "[^[a-z]+]" also exempted
          # a literal "+", so "+" was not removed.
          str_replace_all("[^a-z]+", " ") %>%
          str_squish() %>% # collapse the inserted white space
          replace(is.na(.), "") # missing values become empty strings
      )
    )

  # Paste the location components into one string, separated by ", ".
  # Empty components leave runs of consecutive separators; each run is
  # collapsed to a single separator and leading/trailing separators are
  # dropped. (Deleting ", , " outright removed separators in pairs, which
  # glued neighbouring components together, e.g. "x, , france" became
  # "xfrance" and no longer matched on a word boundary.)
  geni <- geni %>%
    mutate(
      marriage_loc = paste(
        marriage_city,
        marriage_county,
        marriage_state,
        marriage_country,
        marriage_place_name,
        sep = ", "
      ) %>%
        str_replace_all("(, ){2,}", ", ") %>%
        str_remove("^, ") %>%
        str_remove(", $")
    )

  # married_france is 1 when a france_good term is present and no france_bad
  # term is present, 0 otherwise (the bad terms override the good ones).
  geni <- mutate(
    geni,
    married_france = as.numeric(
      str_detect(marriage_loc, france_good_rx) &
        !str_detect(marriage_loc, france_bad_rx)
    )
  )

  # Only the flagged rows are kept and written out.
  geni <- filter(geni, married_france == 1)

  #----------------------------------------------------------------------------#
  print(paste0("Saving: ",chunk_n+1,"/",max_chunk+1))

  write_parquet(geni, paste0("./2_scripts/2_0_tempfiles/fr-mchunk_",chunk_n,"_",max_chunk,".parquet"))
  gc()
}


#------------------------------------------------------------------------------#

#------------------------------------------------------------------------------#
# we only have union_ids, we also want profile_ids
# load the full unions file but just union_id profile_id


# Read every per-chunk temp file back in and stack them into `output`.
# NOTE(review): the pattern matches any fr-mchunk*.parquet in the folder,
# including files left over from earlier runs that used a different
# max_chunk -- clear the temp folder between runs if that can happen.
temp_dir <- "./2_scripts/2_0_tempfiles"
files <- list.files(pattern = "fr-mchunk.*parquet", path = temp_dir)

# Fail here with a clear message rather than later with "object 'output'
# not found" when the first stage produced no files.
if (length(files) == 0) {
  stop("No fr-mchunk parquet files found in ", temp_dir, call. = FALSE)
}

chunk_tables <- lapply(files, function(file) {
  print(file)
  # as_data_frame = FALSE: the file is not converted to an R data frame here.
  read_parquet(file.path(temp_dir, file), as_data_frame = FALSE)
  #  file.remove(file.path(temp_dir, file))
})

# Stack the tables in file order.
output <- Reduce(rbind, chunk_tables)


# Match the flagged marriages in `output` against each chunk of the unions
# file (first two columns only) and stack the matches into `pids`.
max_chunk <- 16
matched <- vector("list", max_chunk + 1)

for (chunk_n in 0:max_chunk) {
  print(paste0("Reading chunk ",chunk_n+1,"/",max_chunk+1))
  fname <- paste0("./1_raw_data/1_2_geni/1_2_geni_chunked/geni_unions/part-",chunk_n,".parquet")
  geni <- read_parquet(fname, col_select = c(1, 2), as_data_frame = FALSE)

  # No `by` is given, so the join uses whatever columns the two tables share
  # -- presumably the union id; TODO confirm.
  # An inner join keeps only the rows that match in this chunk. A left join
  # here returned every row of `output` for every chunk, with NA in the
  # unmatched columns, so each marriage was repeated once per chunk and a
  # spurious all-NA match survived the later de-duplication.
  # NOTE(review): marriages that match in no chunk are no longer carried
  # through with NA values.
  matched[[chunk_n + 1]] <- dplyr::inner_join(output, geni) %>%
    dplyr::compute()
}

# Stack the per-chunk matches in chunk order.
pids <- Reduce(rbind, matched)
gc()

# Collect the stacked result, drop duplicated rows, and save the final table.
pids <- distinct(dplyr::collect(pids))

write_parquet(pids, "./2_scripts/2_0_tempfiles/fr-pids-m.parquet")
#------------------------------------------------------------------------------#